I used R to clean the data. Please refer to the R file if you would like to see more details.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
The unprocessed raw data looks like below:
# Raw export; the cleaning itself was done in the accompanying R script.
uncleaned = pd.read_csv('Data for Case Study Data Science Online Courses.csv')
def overview(uncleaned):
    """Summarize a DataFrame column-by-column.

    Returns a DataFrame with columns
    ['index', 'Data Types', 'Unique Values', 'Missing Values'],
    where 'index' holds the original column names.
    """
    # Use a local name that does not shadow this function's own name.
    summary = pd.DataFrame(uncleaned.dtypes, columns=['Data Types'])
    summary = summary.reset_index()  # column names move into the 'index' column
    summary['Unique Values'] = uncleaned.nunique().values
    summary['Missing Values'] = uncleaned.isnull().sum().values
    return summary
# Summary of the raw file: dtypes, cardinality, missingness per column.
overview(uncleaned)
Characters:
Numerical Variables:
Other Strategy:
# Load the R-cleaned version; column 0 of the CSV is the row index.
data = pd.read_csv('Cleaned_Data Science Online Courses.csv', index_col = 0)
overview(data)
We can see that there is no missing value after cleaning the data and all of the column names look good.
Business insights can be found in the report slides.
Strategies:
# (The word 'Converted' fused onto this line was a markdown fragment from the
#  notebook export; the intended call is data.info().)
data.info()
data.describe()
# Histograms of every numeric column for a quick distribution check.
data.hist(bins = 30, figsize = (20,20), color = '#1169D4')
We can see the following variables have only one value, which means it is useless for further analysis and model building so we drop them.
'DSForums', 'Digital Advertisement', 'Do Not Call', 'Do Not Email', 'Magazine', 'Newspaper',
'Newspaper Article', 'PayThrough', 'ReceiveUpdates', 'Search', 'Through Recommendations', 'UpdateDScontent','UpdateDMcontent'
# Columns with a single distinct value carry no information for analysis
# or modeling, so they are removed.
drop_list = [
    'DSForums', 'Digital Advertisement', 'Do Not Call', 'Do Not Email',
    'Magazine', 'Newspaper', 'Newspaper Article', 'PayThrough',
    'ReceiveUpdates', 'Search', 'Through Recommendations',
    'UpdateDScontent', 'UpdateDMcontent',
]
data = data.drop(columns=drop_list)
Asymmetrique Activity/Profile Index
Asymmetrique Activity/Profile Score
# Distribution of the four Asymmetrique columns before deciding whether to keep them.
fig, axs = plt.subplots(2,2, figsize = (15,10))
# NOTE(review): passing the Series positionally relies on older seaborn
# treating it as the data/x argument; newer versions expect explicit keywords.
plt1 = sns.countplot(data['Asymmetrique Activity Index'], ax = axs[0,0])
plt2 = sns.boxplot(data['Asymmetrique Activity Score'], ax = axs[0,1])
plt3 = sns.countplot(data['Asymmetrique Profile Index'], ax = axs[1,0])
plt4 = sns.boxplot(data['Asymmetrique Profile Score'], ax = axs[1,1])
plt.tight_layout()
I decide to delete those values because:
1) There is too much variation in these values.
2) After Googling these index/score fields, I found they were created by an Indian advertising agency, but there is no documentation explaining their meaning, so no further analysis can be made due to the lack of background information.
# The Asymmetrique index/score columns have no documentation, so no further
# analysis is possible — drop all four.
drop_list2 = [
    'Asymmetrique Activity Index', 'Asymmetrique Activity Score',
    'Asymmetrique Profile Index', 'Asymmetrique Profile Score',
]
data = data.drop(columns=drop_list2)
# Split leads by outcome for the class-balance summary below.
converted = data[data['Converted'] == 1]
nonconverted = data[data['Converted'] == 0]
n_total = len(data)
n_conv = len(converted)
n_nonconv = len(nonconverted)
print("Total =", n_total)
print("Number of leads who converted =", n_conv)
print("Percentage of leads who converted =", 1. * n_conv / n_total * 100.0, "%")
print("Number of leads who did not convert =", n_nonconv)
print("Percentage of leads who did not convert =", 1. * n_nonconv / n_total * 100.0, "%")
sns.countplot(data['Converted']);
# KDE describes the probability density at different values in a continuous variable.
plt.figure(figsize=(12,7))
# NOTE(review): `shade` is deprecated in newer seaborn in favour of `fill`.
sns.kdeplot(converted['TotalVisits'], label = 'leads who converted', shade = True, color = 'r')
sns.kdeplot(nonconverted['TotalVisits'], label = 'leads who did not convert', shade = True, color = 'b')
plt.xlabel('TotalVisits')
When the number of Total visits is less than 3, there are more leads who did not convert than converted.
# Same converted vs non-converted comparison for total time on the website.
plt.figure(figsize=(12,7))
sns.kdeplot(converted['TotalTime'], label = 'leads who converted', shade = True, color = 'r')
sns.kdeplot(nonconverted['TotalTime'], label = 'leads who did not convert', shade = True, color = 'b')
plt.xlabel('TotalTime')
When Total Time is less than 600, there are more leads who did not convert. However, when it is more than 600, leads tend to convert.
# Same comparison for pages-per-visit (PPV).
plt.figure(figsize=(12,7))
sns.kdeplot(converted['PPV'], label = 'leads who converted', shade = True, color = 'r')
sns.kdeplot(nonconverted['PPV'], label = 'leads who did not convert', shade = True, color = 'b')
plt.xlabel('PPV')  # bug fix: the label said 'TotalTime' but this plot shows PPV
There seems to be little difference in PPV between leads who converted and those who did not.
def count(df, v1, v2):
    """Per-category counts and conversion rates.

    Groups ``df`` by the categorical column ``v1`` and reports, for each
    category, the number of rows ('Count') and the mean of the binary target
    ``v2`` expressed as a percentage ('Converted Rate(%)'), sorted by the
    target descending.
    """
    # Mean of a 0/1 target is the conversion rate; *100 so the values match
    # the '(%)' in the column label (the original returned raw fractions).
    rate = df[[v1, v2]].groupby(v1, as_index=False).mean().sort_values(v2, ascending=False)
    rate[v2] = rate[v2] * 100
    size = df[[v1, v2]].groupby(v1, as_index=False).count().sort_values(v2, ascending=False)
    merged = size.merge(rate, on=v1, how='left')
    merged.columns = [v1, 'Count', 'Converted Rate(%)']
    return merged
def crosstab(df, features, target):
    """Plot a stacked horizontal bar chart of each feature's counts by target.

    Plots one chart per feature and returns the count / conversion-rate table
    for the last feature plotted. All call sites in this notebook pass a
    single-feature list, for which this is exactly that feature's table.
    """
    for feature in features:
        pd.crosstab(df[feature], df[target]).plot(kind='barh', figsize=(13,8), stacked=True)
        plt.title('Number of ' + feature + ' by ' + target)
        plt.xlabel('Count')
        plt.ylabel(feature)
    # Return after the loop so multi-feature calls plot every feature instead
    # of stopping at the first one; display the table above the chart.
    return count(df, feature, target)
crosstab(data, ['Lead Origin'], 'Converted')
crosstab(data, ['Lead Source'], 'Converted')
# 'google' and 'Google' are the same source — normalize the spelling.
# Rare sources are collapsed into 'Others' to avoid a data shift between
# the train and test splits.
rare_sources = ['bing', 'Click2call', 'Press_Release', 'Social Media',
                'Live Chat', 'WeLearn', 'Pay per Click Ads', 'NC_EDM',
                'blog', 'testone', 'welearnblog_Home', 'youtubechannel']
lead_source_map = {'google': 'Google'}
lead_source_map.update({src: 'Others' for src in rare_sources})
data['Lead Source'] = data['Lead Source'].replace(lead_source_map)
crosstab(data, ['Lead Source'], 'Converted')
crosstab(data, ['Last Activity'], 'Converted')
# Collapse infrequent activities into a single 'Others' bucket.
rare_activities = ['Had a Phone Conversation', 'Approached upfront',
                   'View in browser link Clicked', 'Email Received',
                   'Email Marked Spam', 'Resubscribed to emails',
                   'Visited Booth in Tradeshow']
data['Last Activity'] = data['Last Activity'].replace(
    {activity: 'Others' for activity in rare_activities})
crosstab(data, ['Last Activity'], 'Converted')
# Country is dominated by India; the column is dropped further below.
crosstab(data, ['Country'], 'Converted')
To be dropped because almost all of the leads come from India.
# Per-category counts and conversion rates for City and Specialization.
crosstab(data, ['City'], 'Converted')
crosstab(data, ['Specialization'], 'Converted')
# Reduce redundancy by merging similar categories. The keys below are spelled
# exactly as they appear in the data (including 'E-Commence').
specialization_map = {
    'Finance Management': 'Finance',
    'Banking, Investment And Insurance': 'Finance',
    'Human Resource Management': 'Human Resource',
    'Operations Management': 'Operations and Supply Chain',
    'Supply Chain Management': 'Operations and Supply Chain',
    'Marketing Management': 'Marketing',
    'Media and Advertising': 'Marketing',
    'Business Administration': 'Business',
    'International Business': 'Business',
    'E-Commence': 'Business',
    'E-Business': 'Business',
    'Hospitality Management': 'Tourism and Hospitality',
    'Services Excellence': 'Tourism and Hospitality',
    'Travel and Tourism': 'Tourism and Hospitality',
    'Healthcare Management': 'Healthcare',
    'Retail Management': 'Retail',
    'IT Projects Management': 'IT',
}
data['Specialization'] = data['Specialization'].replace(specialization_map)
crosstab(data, ['Specialization'], 'Converted')
crosstab(data, ['HowHear'], 'Converted')
# Reduce redundancy by merging similar categories.
data['HowHear'] = data['HowHear'].replace({
    'Student of SomeSchool': 'Word Of Mouth',
    'Other': 'Others',
})
crosstab(data, ['HowHear'], 'Converted')
crosstab(data, ['CurrentOccupation'], 'Converted')
# Fold small / overlapping occupation labels into broader ones.
data['CurrentOccupation'] = data['CurrentOccupation'].replace({
    'Other': 'Others',
    'Businessman': 'Working Professional',
    'Housewife': 'Unemployed',
})
crosstab(data, ['CurrentOccupation'], 'Converted')
crosstab(data, ['WhatMatters'], 'Converted')
# Merge the sparse labels into 'Others'.
data['WhatMatters'] = data['WhatMatters'].replace({
    'Flexibility & Convenience': 'Others',
    'Other': 'Others',
})
crosstab(data, ['WhatMatters'], 'Converted')
crosstab(data, ['Tags'], 'Converted')
# Reduce redundancy by merging similar categories.
# NOTE: 'Insterested' reproduces the label spelling used elsewhere in this
# notebook and is kept byte-identical on purpose.
tags_map = {
    'wrong number given': 'invalid number or not provided',
    'number not provided': 'invalid number or not provided',
    'invalid number': 'invalid number or not provided',
    'Already a student': 'Current Student',
    'Graduation in progress': 'Current Student',
    'Lateral student': 'Current Student',
    'switched off': 'No Response',
    'Busy': 'No Response',
    'opp hangup': 'No Response',
    'Interested in Next batch': 'Insterested',
    'Shall take in the next coming month': 'Insterested',
    'Interested in other courses': 'Insterested',
    'Still Thinking': 'Insterested',
    'In confusion whether part time or DLP': 'Have Question',
    'Want to take admission but has financial problems': 'Have Question',
    'Recognition issue (DEC approval)': 'Have Question',
    'University not recognized': 'Have Question',
    'Lost to EINS': 'Lost',
    'Lost to Others': 'Lost',
}
data['Tags'] = data['Tags'].replace(tags_map)
crosstab(data, ['Tags'], 'Converted')
crosstab(data, ['Lead Quality'], 'Converted')
crosstab(data, ['Lead Profile'], 'Converted')
# Merge 'Select' and 'Other Leads' into a single 'Others' bucket.
data['Lead Profile'] = data['Lead Profile'].replace({
    'Select': 'Others',
    'Other Leads': 'Others',
})
crosstab(data, ['Lead Profile'], 'Converted')
# Remaining categorical columns, before the final column pruning below.
crosstab(data, ['FreeCopy'], 'Converted')
crosstab(data, ['LastNotableActivity'], 'Converted')
Last Activity and LastNotableActivity are similar columns so delete the latter one.
Delete Country, WhatMatters, FreeCopy as well because these columns have nearly one value and they are useless for further analysis.
# 'LastNotableActivity' overlaps 'Last Activity'; the other three columns
# are nearly single-valued, so none of them helps further analysis.
drop_list3 = ['LastNotableActivity', 'Country', 'WhatMatters', 'FreeCopy']
data = data.drop(columns = drop_list3)
data.shape
data.info()
# Store the newly cleaned data file
# data.to_csv('Newly Cleaned Data Science Online Courses.csv')
# Reload from the saved checkpoint so this section can run independently.
data = pd.read_csv('Newly Cleaned Data Science Online Courses.csv', index_col = 0)
# Correlation heatmap of the numeric columns.
# NOTE(review): in pandas >= 2.0, corr() on a frame with object columns
# needs numeric_only=True — confirm the pinned pandas version.
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2)
fig=plt.gcf()
fig.set_size_inches(10,8)
plt.show()
Although TotalVisits and PPV are highly correlated (0.77), I decide not to drop them now because they are very important metrics for further analysis.
TotalVisits represents exposure while PPV shows engagement. PPV is particularly useful for evaluating online course website performance because visitors tend to make purchases when PPV is high.
I will wait to see if there is overfitting problem.
Actually in the later logistic regression, PPV is dropped eventually due to high P value.
# Columns with dtype object are the categorical features.
categ_cols = data.select_dtypes(include='object').columns.to_list()
categ_cols
# One-hot encode them, dropping the first level of each category to avoid
# perfect multicollinearity among the dummies.
onehot_data = pd.get_dummies(data[categ_cols], drop_first=True)
data = pd.concat([data, onehot_data], axis=1)
data = data.drop(categ_cols, axis=1)
data.head()
print('The size of the total dataset is {}'.format(data.shape))
from sklearn.model_selection import train_test_split
X = data.drop(columns = ['Converted'])
y = data['Converted']
# random_state added so the split — and every result reported below — is
# reproducible across reruns; stratify keeps the class ratio in both splits.
X_train, X_test, y_train, y_test, data_train, data_test = train_test_split(
    X,
    y,
    data,
    test_size = 0.2,
    stratify = y,
    shuffle = True,
    random_state = 42,
)
print('Shape of X_train: {}'.format(X_train.shape))
print('Shape of X_test: {}'.format(X_test.shape))
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_features = ['TotalVisits',
                    'TotalTime',
                    'PPV']
# Fit the scaler on the training data only, then apply that SAME transform to
# the test set. Bug fix: the original called fit_transform on X_test too,
# which refits on test statistics (data leakage / inconsistent scaling).
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
Functions for Performance Metrics:
I assume that the current objective is to sell diploma-related courses.
To not waste any sales resources, I prefer a classifier that rejects many leads (relatively low recall) but keeps only the best true hot leads (high precision).
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
def flat_accuracy(pred, true):
    """Print and return the share of predictions equal to the labels."""
    pred = np.asarray(pred)
    true = np.asarray(true)
    # Bug fix: compare against the `true` argument, not the global y_test.
    acc = (pred == true).sum() / true.shape[0]
    print('Accuracy = %.4f'% (acc*100) + '%')
    return acc

def flat_precision(pred, true):
    """Print and return the macro-averaged precision."""
    precision = precision_score(true, pred, average='macro') # Due to class imbalance, we prefer macro averaging
    print('Precision = %.4f'% (precision*100) + '%')
    return precision

def flat_recall(pred, true):
    """Print and return the macro-averaged recall."""
    recall = recall_score(true, pred, average='macro') # Due to class imbalance, we prefer macro averaging
    print('Recall = %.4f'% (recall*100) + '%')
    return recall

def flat_f1(pred, true):
    """Print and return the macro-averaged F1 score."""
    f1 = f1_score(true, pred, average='macro')
    print('F1 = %.4f'% (f1*100) + '%')
    return f1
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score
def plot_cm(true, pred, model=''):
    """Plot a 2x2 confusion matrix with per-cell counts and accuracy in the title.

    NOTE(review): if a caller passes (pred, true) — swapped relative to this
    signature — accuracy is unaffected (symmetric) but the matrix is
    transposed, so the axis labels end up reversed; confirm the argument
    order at the call sites.
    """
    # Confusion matrix
    cm_matrix = cm(true, pred)
    # Accuracy
    accuracy = (true == pred).sum() / len(true)
    plt.clf()
    plt.imshow(cm_matrix, interpolation='nearest', cmap=plt.cm.Wistia)
    classNames = ['0', '1']
    plt.title('Validation Data %s - Accuracy %.4f' % (model, accuracy))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)
    # Annotate each cell with its raw count.
    for i in range(2):
        for j in range(2):
            plt.text(j,i, str(cm_matrix[i][j]))
    plt.show()
import matplotlib.pyplot as plt
def plot_prc(classifier, X_test, y_test, y_pred):
    """Plot the precision-recall curve for `classifier` on the test set.

    The average precision shown in the title is computed from the hard
    predictions `y_pred`, not probabilities.

    NOTE(review): plot_precision_recall_curve was deprecated in scikit-learn
    1.0 and removed in 1.2 (PrecisionRecallDisplay is the replacement) —
    confirm the pinned sklearn version before upgrading.
    """
    average_precision = average_precision_score(y_test, y_pred)
    disp = plot_precision_recall_curve(classifier, X_test, y_test)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
from catboost import CatBoostClassifier
# No hyperparameters needed; logging_level='Silent' suppresses CatBoost's
# per-iteration output so this file stays readable.
classifier_CatBoost = CatBoostClassifier(logging_level = 'Silent')
classifier_CatBoost.fit(X_train, y_train)
y_pred_CatBoost = classifier_CatBoost.predict(X_test)
flat_accuracy(y_pred_CatBoost, y_test)
flat_precision(y_pred_CatBoost, y_test)
flat_recall(y_pred_CatBoost, y_test)
flat_f1(y_pred_CatBoost, y_test)
# Bug fix: plot_cm's signature is (true, pred) — pass y_test first so the
# confusion matrix axes are labelled correctly.
plot_cm(y_test, y_pred_CatBoost, 'CatBoost')
plot_prc(classifier_CatBoost, X_test, y_test, y_pred_CatBoost)
from sklearn.metrics import roc_auc_score
# AUC from hard 0/1 predictions (not probabilities), so this understates the
# probability-based ROC AUC.
roc_auc_score(y_test, y_pred_CatBoost)
Take a look at the feature importance
# Rank features by CatBoost's internal importance measure.
featureImp = [[feat, importance]
              for feat, importance in zip(X.columns, classifier_CatBoost.feature_importances_)]
fT_df = pd.DataFrame(featureImp, columns = ['Feature', 'Importance'])
fT_df.sort_values('Importance', ascending = False).head(15)
from sklearn.model_selection import cross_val_score
# 10-fold CV on the training split to check the test-set numbers are not a
# lucky split.
accuracies = cross_val_score(estimator = classifier_CatBoost, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
from sklearn.ensemble import RandomForestClassifier
# Base model; n_jobs=-1 uses all cores.
Classifier_RF = RandomForestClassifier(criterion='entropy', n_estimators=200, n_jobs = -1)
# Small grid over tree depth and leaf size.
parameters_RF = {'max_depth': [14,16,18],
                 'min_samples_leaf': [1,2]}
from sklearn.model_selection import GridSearchCV
grid_search_RF = GridSearchCV(Classifier_RF, parameters_RF, cv=5, verbose=2, n_jobs=-1,
                              scoring='roc_auc_ovr_weighted')
grid_search_RF.fit(X_train, y_train)
grid_search_RF.best_params_
# Refit the final RF. NOTE(review): max_depth=16 / min_samples_leaf=1 are
# hard-coded (presumably from grid_search_RF.best_params_) and
# min_samples_split=4 was never searched — confirm both are intentional.
Classifier_RF = RandomForestClassifier(criterion='entropy', n_estimators=200, min_samples_split=4,
                                       max_depth=16, min_samples_leaf=1, n_jobs = -1)
Classifier_RF.fit(X_train, y_train)
y_pred_RF = Classifier_RF.predict(X_test)
flat_accuracy(y_pred_RF, y_test)
flat_precision(y_pred_RF, y_test)
flat_recall(y_pred_RF, y_test)
flat_f1(y_pred_RF, y_test)
# Bug fix: plot_cm expects (true, pred).
plot_cm(y_test, y_pred_RF, 'Random Forest')
plot_prc(Classifier_RF, X_test, y_test, y_pred_RF)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = Classifier_RF, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
from xgboost import XGBClassifier
Classifier_XGBoost = XGBClassifier(booster = 'gbtree')
# Bug fix: range(200, 500, 700) produces only [200] (step 700 overshoots);
# the intent was to try the three values 200, 500 and 700.
parameters_XG = {'n_estimators': [200, 500, 700],
                 'max_depth': [35,45,55],
                 'learning_rate': [0.1, 0.01, 0.05]
                 }
grid_search_XG = GridSearchCV(Classifier_XGBoost, parameters_XG, cv=5, verbose=2, n_jobs=-1,
                              scoring='roc_auc_ovr_weighted')
grid_search_XG.fit(X_train, y_train)
grid_search_XG.best_params_
# Refit XGBoost with the selected hyperparameters.
Classifier_XG = XGBClassifier(booster = 'gbtree', max_depth=35, n_estimators=200,
                              learning_rate=0.05, n_jobs=-1)
Classifier_XG.fit(X_train, y_train)
y_pred_XG = Classifier_XG.predict(X_test)
flat_accuracy(y_pred_XG, y_test)
flat_precision(y_pred_XG, y_test)
flat_recall(y_pred_XG, y_test)
flat_f1(y_pred_XG, y_test)
# Bug fixes: the title said 'Random Forest' (copy-paste) and plot_cm
# expects (true, pred).
plot_cm(y_test, y_pred_XG, 'XGBoost')
plot_prc(Classifier_XG, X_test, y_test, y_pred_XG)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = Classifier_XG, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
Strategies:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
classifier_LR = LogisticRegression(random_state = 0)
from sklearn.feature_selection import RFE
# NOTE(review): the positional n_features_to_select argument became
# keyword-only in newer scikit-learn; RFE(classifier_LR,
# n_features_to_select=200) is the forward-compatible spelling.
rfe = RFE(classifier_LR, 200) # running RFE with 200 variables as output
rfe = rfe.fit(X_train, y_train)
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Columns RFE kept; the discarded ones are shown (output suppressed by ';').
col = X_train.columns[rfe.support_]
X_train.columns[~rfe.support_];
# Round-1 GLM fit on the RFE-selected columns (binomial family = logistic).
X_train_sm_1 = sm.add_constant(X_train[col])
lr1 = sm.GLM(y_train, X_train_sm_1, family = sm.families.Binomial())
res = lr1.fit()
res.summary()
from statsmodels.stats.outliers_influence import variance_inflation_factor
# VIF per selected feature to spot multicollinearity.
VIF_rank = pd.DataFrame()
VIF_rank['Features'] = X_train[col].columns
VIF_rank['VIF'] = [variance_inflation_factor(X_train[col].values, i)
                   for i in range(X_train[col].shape[1])]
VIF_rank['VIF'] = round(VIF_rank['VIF'], 2)
VIF_rank = VIF_rank.sort_values(by = "VIF", ascending = False)
VIF_rank.head(30)
# pd.set_option('display.max_rows', None);
# Round 2: drop high-VIF / high-p-value features from round 1.
# Bug fix: `col` is an Index and Index.drop has no axis parameter — the
# trailing positional `1` was silently landing in its `errors` parameter.
col1 = col.drop(['Lead Profile_Others','HowHear_Others', 'Lead Source_Facebook',
                 'Lead Origin_Lead Import&Quick Add Form', 'Lead Origin_Lead Add Form',
                 'Lead Source_Reference', 'Lead Profile_Potential Lead',
                 'Tags_Others', 'Lead Quality_Others', 'Specialization_Others'])
X_train_sm_2 = sm.add_constant(X_train[col1])
lr2 = sm.GLM(y_train, X_train_sm_2, family = sm.families.Binomial())
res = lr2.fit()
res.summary()
# Recompute VIF on the round-2 columns.
VIF_rank = pd.DataFrame()
VIF_rank['Features'] = X_train[col1].columns
VIF_rank['VIF'] = [variance_inflation_factor(X_train[col1].values, i)
                   for i in range(X_train[col1].shape[1])]
VIF_rank['VIF'] = round(VIF_rank['VIF'], 2)
VIF_rank = VIF_rank.sort_values(by = "VIF", ascending = False)
VIF_rank.head(20)
# Drop the features with P values that are larger than 0.05
# Drop the features with VIF scores that are larger than 3
# Bug fixes: 'Specialization_IT' was listed seven times (deduplicated), and
# the stray positional `1` is removed (Index.drop takes no axis argument).
col2 = col1.drop(['Lead Source_Olark Chat', 'Lead Source_Organic Search','Lead Source_Others',
                  'Lead Source_Referral Sites', 'Lead Source_Welingak Website',
                  'Last Activity_Email Bounced','Last Activity_Form Submitted on Website',
                  'Last Activity_Olark Chat Conversation','Last Activity_Unsubscribed',
                  'Specialization_Finance','Specialization_Healthcare',
                  'Specialization_IT', 'Specialization_Marketing',
                  'Specialization_Operations and Supply Chain',
                  'Specialization_Retail','Specialization_Rural and Agribusiness',
                  'HowHear_Email', 'HowHear_Multiple Sources',
                  'Tags_Not doing further education', 'Tags_invalid number or not provided',
                  'Lead Quality_Might be','Lead Profile_Lateral Student',
                  'Lead Profile_Student of SomeSchool', 'City_Other Cities',
                  'City_Other Cities of Maharashtra', 'City_Other Metro Cities',
                  'City_Thane & Outskirts', 'City_Tier II Cities'])
# Round-3 GLM fit and VIF check on the reduced feature set.
X_train_sm_3 = sm.add_constant(X_train[col2])
lr3 = sm.GLM(y_train, X_train_sm_3, family = sm.families.Binomial())
res = lr3.fit()
res.summary()
VIF_rank = pd.DataFrame()
VIF_rank['Features'] = X_train[col2].columns
VIF_rank['VIF'] = [variance_inflation_factor(X_train[col2].values, i) for i in range(X_train[col2].shape[1])]
VIF_rank['VIF'] = round(VIF_rank['VIF'], 2)
VIF_rank = VIF_rank.sort_values(by = "VIF", ascending = False)
VIF_rank
# Round 4: drop remaining high-p / high-VIF features (PPV among them).
# Stray positional `1` removed — Index.drop has no axis argument.
col3 = col2.drop(['CurrentOccupation_Unemployed', 'Lead Origin_Landing Page Submission',
                  'PPV','Last Activity_Unreachable','Specialization_Human Resource',
                  'Specialization_Tourism and Hospitality','HowHear_SMS',
                  'HowHear_Social Media','Lead Quality_Not Sure'])
X_train_sm_4 = sm.add_constant(X_train[col3])
lr4 = sm.GLM(y_train, X_train_sm_4, family = sm.families.Binomial())
res = lr4.fit()
res.summary()
VIF_rank = pd.DataFrame()
VIF_rank['Features'] = X_train[col3].columns
VIF_rank['VIF'] = [variance_inflation_factor(X_train[col3].values, i) for i in range(X_train[col3].shape[1])]
VIF_rank['VIF'] = round(VIF_rank['VIF'], 2)
VIF_rank = VIF_rank.sort_values(by = "VIF", ascending = False)
VIF_rank
# Round 5: final pruning of insignificant dummies.
# Stray positional `1` removed — Index.drop has no axis argument.
col4 = col3.drop(['HowHear_Online Search',
                  'HowHear_Word Of Mouth',
                  'CurrentOccupation_Student',
                  'Tags_Diploma holder (Not Eligible)',
                  'Tags_Have Question',
                  'Tags_in touch with EINS',
                  'Last Activity_Others'])
X_train_sm_5 = sm.add_constant(X_train[col4])
lr5 = sm.GLM(y_train, X_train_sm_5, family = sm.families.Binomial())
res = lr5.fit()
res.summary()
VIF_rank = pd.DataFrame()
VIF_rank['Features'] = X_train[col4].columns
VIF_rank['VIF'] = [variance_inflation_factor(X_train[col4].values, i) for i in range(X_train[col4].shape[1])]
VIF_rank['VIF'] = round(VIF_rank['VIF'], 2)
VIF_rank = VIF_rank.sort_values(by = "VIF", ascending = False)
VIF_rank
Performance Evaluation
# Score the held-out set with the final GLM (`res` holds the lr5 fit).
X_test_sm_5 = sm.add_constant(X_test[col4])
y_pred_LR_probability = res.predict(X_test_sm_5)
y_pred_LR_probability
y_pred_LR_probability = y_pred_LR_probability.values.reshape(-1)
y_pred_LR_probability[:10]
import numpy as np
np.set_printoptions(suppress=True);
# Hard predictions at the default 0.5 threshold.
y_pred_LR = (y_pred_LR_probability > 0.5).astype(int)
y_pred_LR
flat_accuracy(y_pred_LR, y_test)
flat_precision(y_pred_LR, y_test)
flat_recall(y_pred_LR, y_test)
flat_f1(y_pred_LR, y_test)
# NOTE(review): plot_cm's signature is (true, pred); the arguments here look
# swapped — accuracy is unaffected but the matrix is transposed. Confirm.
plot_cm(y_pred_LR, y_test, 'Logistic Regression')
# Features Coefficients
pd.options.display.float_format = '{:.2f}'.format
fit_parameters = res.params[1:]
fit_parameters
Logistic Regression Model Equation
logit(p) = log(p/(1-p))= β0 + β1* X1 + … + βn * Xn
Put the coefficient numbers from the best model to the logistic regression, and then we get the following equation.
Findings:
The regression coefficients show the change in log(odds) in Converted for a unit change in the predictor variable, holding all other predictor variables constant.
In this sense, the above equation can be interpreted as follows:
Total Time is associated with an increase in the odds of conversion by a factor of exp(0.92) ≈ 2.5 — about a 150% increase — holding everything else constant. The odds of a working professional (CurrentOccupation_Working Professional) converting are about exp(1.69) ≈ 7.1 times — 610% higher than — the odds of a non-working professional doing so, holding everything else constant. Regarding occupation, being a working professional is positively associated with conversion. Regarding last activity, leads who sent an SMS, opened or clicked an email, or visited website pages are positively related to conversion. Tags with education information are all negatively related to conversion, which implies the diploma-related program did not sell well. The lead-quality label correctly reflects conversion, as does time on the website, while total visits are not necessarily the case. After comparing these four models, I find that CatBoost wins, so I will use the CatBoost model to predict conversion probability and then build the lead scoring system.
from catboost import CatBoostClassifier;
# Refit CatBoost — chosen above as the best of the four models — to score leads.
classifier_CatBoost = CatBoostClassifier(logging_level = 'Silent')
classifier_CatBoost.fit(X_train, y_train)
y_pred_CatBoost = classifier_CatBoost.predict(X_test)
y_pred_CatBoost
# Get predicted probability
y_pred_CatBoost_pro = classifier_CatBoost.predict_proba(X_test)
# Keep only the second column, i.e. P(Converted == 1).
y_pred_CatBoost_pro =[i[1] for i in y_pred_CatBoost_pro]
# Create a dataframe with the actual conversion, Predicted Probabilities and Predicted Conversion
Lead_Scoring = pd.DataFrame({'Conversion':y_test.values, 'Conversion Probability':y_pred_CatBoost_pro})
Lead_Scoring['Prediction Class'] = y_pred_CatBoost
Lead_Scoring.head(10)
# Lead score = conversion probability scaled to 0-100 and rounded.
Lead_Scoring['Lead Score'] = Lead_Scoring['Conversion Probability'].map( lambda x: round(x*100))
Lead_Scoring.head()
# Define a function for putting lead score into four buckets
def lead_bucket(x):
    """Map a 0-100 lead score to a temperature bucket.

    Cold: < 25, Cool: 25-49, Warm: 50-75, Hot: >= 76.
    Bug fix: the original tested x >= 51 for Warm, so a score of exactly 50
    fell through every branch and was mislabelled "Hot".
    """
    if x < 25:
        return "Cold"
    elif x < 50:
        return "Cool"
    elif x < 76:
        return "Warm"
    else:
        return "Hot"
# Bucket every lead by its score.
Lead_Scoring['Lead Buckets'] = Lead_Scoring['Lead Score'].apply(lead_bucket)
Reorder the columns so that the first column is the actual conversion and the second is the predicted conversion, for easier comparison.
# Bug fix: the actuals column was created as 'Conversion' (not 'Converted');
# selecting 'Converted' here raised a KeyError.
Lead_Scoring = Lead_Scoring[['Conversion', 'Prediction Class', 'Conversion Probability',
                             'Lead Score', 'Lead Buckets']]
Lead_Scoring.tail(20)
Thanks!